InĀ [75]:
# Import pandas, the library used for the data analysis below
import pandas as pd
InĀ [77]:
# Load the trip-record CSV from its local path into a DataFrame.
data = pd.read_csv(
    "C:\\Users\\jafer\\Desktop\\yellow_tripdata_2015-01.csv"
)
InĀ [79]:
# Preview the first five rows of the dataset.
data.head(n=5)
Out[79]:
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance pickup_longitude pickup_latitude RateCodeID dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount
0 2 15/01/2015 19:05 15/01/2015 19:23 1 1.59 -73.993896 40.750111 1 -73.974785 40.750618 1 12.0 1.0 0.5 3.25 0.0 0.3 17.05
1 1 10/01/2015 20:33 10/01/2015 20:53 1 3.30 -74.001648 40.724243 1 -73.994415 40.759110 1 14.5 0.5 0.5 2.00 0.0 0.3 17.80
2 1 10/01/2015 20:33 10/01/2015 20:43 1 1.80 -73.963341 40.802788 1 -73.951820 40.824413 2 9.5 0.5 0.5 0.00 0.0 0.3 10.80
3 1 10/01/2015 20:33 10/01/2015 20:35 1 0.50 -74.009087 40.713818 1 -74.004326 40.719986 2 3.5 0.5 0.5 0.00 0.0 0.3 4.80
4 1 10/01/2015 20:33 10/01/2015 20:52 1 3.00 -73.971176 40.762428 1 -74.004181 40.742653 2 15.0 0.5 0.5 0.00 0.0 0.3 16.30
InĀ [81]:
# Display the shape of the dataset as a (rows, columns) tuple.
# For this file the result is 572712 rows and 18 columns.
data.shape
Out[81]:
(572712, 18)
InĀ [83]:
# Print a summary of the DataFrame: each column's name, non-null count and
# dtype, followed by the dtype totals and the memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 572712 entries, 0 to 572711
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               572712 non-null  int64  
 1   tpep_pickup_datetime   572712 non-null  object 
 2   tpep_dropoff_datetime  572712 non-null  object 
 3   passenger_count        572712 non-null  int64  
 4   trip_distance          572712 non-null  float64
 5   pickup_longitude       572712 non-null  float64
 6   pickup_latitude        572712 non-null  float64
 7   RateCodeID             572712 non-null  int64  
 8   dropoff_longitude      572712 non-null  float64
 9   dropoff_latitude       572712 non-null  float64
 10  payment_type           572712 non-null  int64  
 11  fare_amount            572712 non-null  float64
 12  extra                  572712 non-null  float64
 13  mta_tax                572712 non-null  float64
 14  tip_amount             572712 non-null  float64
 15  tolls_amount           572712 non-null  float64
 16  improvement_surcharge  572712 non-null  float64
 17  total_amount           572712 non-null  float64
dtypes: float64(12), int64(4), object(2)
memory usage: 78.7+ MB
InĀ [85]:
# One-time setup (run only if the profiling package is not installed):
# %pip install ydata-profiling
InĀ [87]:
# Libraries for numerical work, data handling, plotting and automated profiling.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling
# ProfileReport is the class used below to build the report. It lives in
# ydata_profiling (the successor of pandas_profiling); the previous
# `from pandas_profiling import ydata_profiling` line did not bring it into scope.
from ydata_profiling import ProfileReport
InĀ [89]:
# Reload the same CSV into `data` (repeats the earlier load of this file).
data = pd.read_csv(
    "C:\\Users\\jafer\\Desktop\\yellow_tripdata_2015-01.csv"
)
InĀ [91]:
# Display the DataFrame; the notebook renders a truncated view showing the
# first and last rows plus the overall row/column count.
data
Out[91]:
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance pickup_longitude pickup_latitude RateCodeID dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount
0 2 15/01/2015 19:05 15/01/2015 19:23 1 1.59 -73.993896 40.750111 1 -73.974785 40.750618 1 12.0 1.0 0.5 3.25 0.0 0.3 17.05
1 1 10/01/2015 20:33 10/01/2015 20:53 1 3.30 -74.001648 40.724243 1 -73.994415 40.759110 1 14.5 0.5 0.5 2.00 0.0 0.3 17.80
2 1 10/01/2015 20:33 10/01/2015 20:43 1 1.80 -73.963341 40.802788 1 -73.951820 40.824413 2 9.5 0.5 0.5 0.00 0.0 0.3 10.80
3 1 10/01/2015 20:33 10/01/2015 20:35 1 0.50 -74.009087 40.713818 1 -74.004326 40.719986 2 3.5 0.5 0.5 0.00 0.0 0.3 4.80
4 1 10/01/2015 20:33 10/01/2015 20:52 1 3.00 -73.971176 40.762428 1 -74.004181 40.742653 2 15.0 0.5 0.5 0.00 0.0 0.3 16.30
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
572707 1 29/01/2015 19:55 29/01/2015 20:17 1 4.90 -74.001946 40.715950 1 -73.953331 40.771603 1 18.0 1.0 0.5 4.95 0.0 0.3 24.75
572708 1 29/01/2015 19:55 29/01/2015 20:01 1 1.20 -73.983826 40.749725 1 -73.995209 40.739670 2 6.0 0.5 0.5 0.00 0.0 0.3 7.30
572709 1 29/01/2015 19:55 29/01/2015 19:59 1 0.80 -73.966621 40.764755 1 -73.963760 40.773510 2 5.0 0.5 0.5 0.00 0.0 0.3 6.30
572710 1 29/01/2015 19:55 29/01/2015 19:59 1 0.90 -73.997810 40.756504 1 -73.987839 40.764717 1 5.0 1.0 0.5 1.35 0.0 0.3 8.15
572711 1 29/01/2015 19:55 29/01/2015 20:18 1 6.50 -73.952454 40.777096 1 -74.004990 40.731491 1 23.0 1.0 0.5 2.00 0.0 0.3 26.80

572712 rows × 18 columns

InĀ [93]:
# Build the profiling report object for the loaded dataset.
Profile = ProfileReport(
    data,
    title='Pandas Profiling Report',
    explorative=True,
)
InĀ [33]:
# Show the report as this cell's output.
# Alternative left disabled by the author: Profile.to_notebook_iframe()
Profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[33]:

InĀ [96]:
# Write the profiling report to "output.html" (path is relative to the
# current working directory).
Profile.to_file(output_file="output.html")
Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]